Introduction

This IPython notebook illustrates how to manually update attribute types and attribute correspondences, and then generate features for blocking/matching.

First, we need to import the py_entitymatching package and other libraries as follows:



In [1]:

    
# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd

Then, read the (sample) input tables for blocking purposes.



In [2]:

    
# Locate the 'datasets' directory under the py_entitymatching install path.
# os.path.join inserts the platform separator itself, so the result stays
# well-formed even if the install path already ends with a separator.
datasets_dir = os.path.join(em.get_install_path(), 'datasets')

# Build the paths of the two input tables (A and B)
path_A = os.path.join(datasets_dir, 'person_table_A.csv')
path_B = os.path.join(datasets_dir, 'person_table_B.csv')



In [3]:

    
# Load table A, then table B, from their CSV files; 'ID' is declared as the
# key attribute of each table.
A, B = (em.read_csv_metadata(csv_path, key='ID') for csv_path in (path_A, path_B))

Getting Attribute Types



In [4]:

    
# Get the attribute types of tables A and B. Each result is a dict keyed by
# attribute name (plus a '_table' entry), with type labels such as 'numeric'
# or 'str_bt_1w_5w' as values -- see the outputs of the following cells.
atypes1 = em.get_attr_types(A)
atypes2 = em.get_attr_types(B)



In [5]:

    
# List the keys of the attribute-type dict obtained for table A
atypes1.keys()









    Out[5]:





dict_keys(['ID', 'zipcode', '_table', 'name', 'hourly_wage', 'address', 'birth_year'])



In [6]:

    
# Show the type recorded for each non-key attribute of table A
tuple(atypes1[attr] for attr in ('birth_year', 'hourly_wage', 'address', 'name', 'zipcode'))









    Out[6]:





('numeric', 'numeric', 'str_bt_1w_5w', 'str_bt_1w_5w', 'numeric')



In [7]:

    
# Show the type recorded for each non-key attribute of table B
tuple(atypes2[attr] for attr in ('birth_year', 'hourly_wage', 'address', 'name', 'zipcode'))









    Out[7]:





('numeric', 'numeric', 'str_bt_5w_10w', 'str_bt_1w_5w', 'numeric')

Updating Attribute Types



In [8]:

    
# Compare the type recorded for 'address' in table A with the one in table B
tuple(attr_types['address'] for attr_types in (atypes1, atypes2))









    Out[8]:





('str_bt_1w_5w', 'str_bt_5w_10w')



In [9]:

    
# Set the 'address' type to 'str_bt_1w_5w' in both dicts so that tables A and B
# agree on the type of this attribute.
atypes1['address'] = atypes2['address'] = 'str_bt_1w_5w'

Getting Attribute Correspondences



In [10]:

    
# Get the attribute correspondences between tables A and B. The result is a
# dict with 'corres', 'ltable' and 'rtable' entries (inspected in the next cells).
block_c = em.get_attr_corres(A, B)



In [11]:

    
# List the entries stored in the attribute-correspondence dict
block_c.keys()









    Out[11]:





dict_keys(['corres', 'rtable', 'ltable'])



In [12]:

    
# Compare object identities: A against block_c['ltable'] and B against
# block_c['rtable'] (equal ids mean the dict refers to the very same objects).
tuple(id(table) for table in (A, block_c['ltable'], B, block_c['rtable']))









    Out[12]:





(4509225032, 4509225032, 4509225816, 4509225816)



In [13]:

    
# Show the list of (left attribute, right attribute) correspondence pairs
block_c['corres']









    Out[13]:





[('ID', 'ID'),
 ('name', 'name'),
 ('birth_year', 'birth_year'),
 ('hourly_wage', 'hourly_wage'),
 ('address', 'address'),
 ('zipcode', 'zipcode')]

Updating Attribute Correspondences



In [14]:

    
# Replace the correspondences with the same-named pairs for every attribute
# except the key, i.e. leave out the ('ID', 'ID') pair.
block_c['corres'] = [(attr, attr) for attr in
                     ('name', 'birth_year', 'hourly_wage', 'address', 'zipcode')]

Getting Tokenizers



In [23]:

    
# Get the tokenizers to be used for blocking; the result is a dict mapping
# tokenizer names (e.g. 'qgm_3', 'wspace') to tokenizer functions.
tok = em.get_tokenizers_for_blocking() 
# To get the tokenizers for matching instead, use:
#tok = em.get_tokenizers_for_matching()



In [16]:

    
# Display the tokenizer dict (name -> function)
tok









    Out[16]:





{'alphabetic': <function py_entitymatching.feature.tokenizers.tok_alphabetic>,
 'alphanumeric': <function py_entitymatching.feature.tokenizers.tok_alphanumeric>,
 'dlm_dc0': <function py_entitymatching.feature.tokenizers._make_tok_delim.<locals>.tok_delim>,
 'qgm_2': <function py_entitymatching.feature.tokenizers._make_tok_qgram.<locals>.tok_qgram>,
 'qgm_3': <function py_entitymatching.feature.tokenizers._make_tok_qgram.<locals>.tok_qgram>,
 'wspace': <function py_entitymatching.feature.tokenizers.tok_wspace>}

Getting Similarity Functions



In [22]:

    
#for blocking
sim = em.get_sim_funs_for_blocking()

#for matching
#sim = em.get_sim_funs_for_matching()



In [18]:

    
# Display the similarity-function dict (name -> function)
sim









    Out[18]:





{'abs_norm': <function py_entitymatching.feature.simfunctions.abs_norm>,
 'affine': <function py_entitymatching.feature.simfunctions.affine>,
 'cosine': <function py_entitymatching.feature.simfunctions.cosine>,
 'dice': <function py_entitymatching.feature.simfunctions.dice>,
 'exact_match': <function py_entitymatching.feature.simfunctions.exact_match>,
 'hamming_dist': <function py_entitymatching.feature.simfunctions.hamming_dist>,
 'hamming_sim': <function py_entitymatching.feature.simfunctions.hamming_sim>,
 'jaccard': <function py_entitymatching.feature.simfunctions.jaccard>,
 'jaro': <function py_entitymatching.feature.simfunctions.jaro>,
 'jaro_winkler': <function py_entitymatching.feature.simfunctions.jaro_winkler>,
 'lev_dist': <function py_entitymatching.feature.simfunctions.lev_dist>,
 'lev_sim': <function py_entitymatching.feature.simfunctions.lev_sim>,
 'monge_elkan': <function py_entitymatching.feature.simfunctions.monge_elkan>,
 'needleman_wunsch': <function py_entitymatching.feature.simfunctions.needleman_wunsch>,
 'overlap_coeff': <function py_entitymatching.feature.simfunctions.overlap_coeff>,
 'rel_diff': <function py_entitymatching.feature.simfunctions.rel_diff>,
 'smith_waterman': <function py_entitymatching.feature.simfunctions.smith_waterman>}

Getting Features



In [19]:

    
# Generate the feature table for A and B from the (manually updated) attribute
# types and attribute correspondences, together with the chosen tokenizers and
# similarity functions. The result is a pandas DataFrame (checked further below).
feature_table = em.get_features(A, B, atypes1, atypes2, block_c, tok, sim)



In [20]:

    
# Show only the features whose left attribute is 'address'
feature_table.loc[feature_table['left_attribute'] == 'address']









    Out[20]:






  
    
      
      feature_name
      left_attribute
      right_attribute
      left_attr_tokenizer
      right_attr_tokenizer
      simfunction
      function
      function_source
      is_auto_generated
    
  
  
    
      16
      address_address_jac_qgm_3_qgm_3
      address
      address
      qgm_3
      qgm_3
      jaccard
      <function address_address_jac_qgm_3_qgm_3 at 0x10f959c80>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      17
      address_address_cos_dlm_dc0_dlm_dc0
      address
      address
      dlm_dc0
      dlm_dc0
      cosine
      <function address_address_cos_dlm_dc0_dlm_dc0 at 0x10f959d08>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      18
      address_address_jac_dlm_dc0_dlm_dc0
      address
      address
      dlm_dc0
      dlm_dc0
      jaccard
      <function address_address_jac_dlm_dc0_dlm_dc0 at 0x10f959d90>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      19
      address_address_mel
      address
      address
      None
      None
      monge_elkan
      <function address_address_mel at 0x10f959e18>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      20
      address_address_lev_dist
      address
      address
      None
      None
      lev_dist
      <function address_address_lev_dist at 0x10f959ea0>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      21
      address_address_lev_sim
      address
      address
      None
      None
      lev_sim
      <function address_address_lev_sim at 0x10f959f28>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      22
      address_address_nmw
      address
      address
      None
      None
      needleman_wunsch
      <function address_address_nmw at 0x10f9bb048>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True
    
    
      23
      address_address_sw
      address
      address
      None
      None
      smith_waterman
      <function address_address_sw at 0x10f9bb0d0>
      from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...
      True



In [21]:

    
# Check the Python type of the generated feature table
type(feature_table)









    Out[21]:





pandas.core.frame.DataFrame

	feature_name	left_attribute	right_attribute	left_attr_tokenizer	right_attr_tokenizer	simfunction	function	function_source	is_auto_generated
16	address_address_jac_qgm_3_qgm_3	address	address	qgm_3	qgm_3	jaccard	<function address_address_jac_qgm_3_qgm_3 at 0x10f959c80>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
17	address_address_cos_dlm_dc0_dlm_dc0	address	address	dlm_dc0	dlm_dc0	cosine	<function address_address_cos_dlm_dc0_dlm_dc0 at 0x10f959d08>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
18	address_address_jac_dlm_dc0_dlm_dc0	address	address	dlm_dc0	dlm_dc0	jaccard	<function address_address_jac_dlm_dc0_dlm_dc0 at 0x10f959d90>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
19	address_address_mel	address	address	None	None	monge_elkan	<function address_address_mel at 0x10f959e18>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
20	address_address_lev_dist	address	address	None	None	lev_dist	<function address_address_lev_dist at 0x10f959ea0>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
21	address_address_lev_sim	address	address	None	None	lev_sim	<function address_address_lev_sim at 0x10f959f28>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
22	address_address_nmw	address	address	None	None	needleman_wunsch	<function address_address_nmw at 0x10f9bb048>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True
23	address_address_sw	address	address	None	None	smith_waterman	<function address_address_sw at 0x10f9bb0d0>	from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...	True